Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)Libraries
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)Paths
metadata_path <- "data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
chromosomes_path <- "../Crypto_Desjardins_Ashton/results_clean/02.Dataset/chromosomes.csv"
chrom_lengths_path <- "data/processed/chromosome_lengths.tsv"
depth_by_chrom_good_desj_path <- "../Crypto_Desjardins/results_clean/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
depth_by_chrom_good_ashton_path <- "../Crypto_Ashton/results_clean/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
cnv_calls_path <- "../Crypto_Desjardins_Ashton/results_clean/02.Dataset/cnv/cnv_calls.tsv"
duplications_path <- "results/tables/duplications_putative.tsv"
repeats_path_prefix <- "../Crypto_Desjardins/results_clean/03.References/"Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99 .
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.
metadata <- read.delim(
metadata_path,
header=TRUE,
sep=",",
stringsAsFactor = TRUE)
metadata <- metadata %>%
select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
filter(!strain == "H99") %>%
group_by(dataset, lineage)%>%
mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
ungroup() %>%
group_by(lineage)%>%
mutate(samples_in_lineage = n_distinct(sample))%>%
ungroup()%>%
mutate(total_samples = n_distinct(sample))%>%
droplevels()Get the nice chromosome names from the chromosomes file in WeavePop.
chromosome_names = read.delim(
chromosomes_path,
header=TRUE, sep=",")
chromosome_names <- chromosome_names %>%
mutate(chromosome = str_pad(chromosome, 2, pad = "0"))%>%
mutate(chromosome = as.factor(chromosome))
levels(chromosome_names$chromosome) <- paste("chr", chromosome_names$chromosome, sep="")Get the chromosome lengths. Create the file with bash.
# /usr/bin/bash
tail -n +2 ../Crypto_Desjardins/config/chromosomes.csv | \
cut -d',' -f1 | sort | uniq | while read line
do
seqkit fx2tab ../Crypto_Desjardins/data/references/$line.fasta -l -i -n >> data/processed/chromosome_lengths.tsv
donechromosome_lengths = read.delim(
chrom_lengths_path,
header=FALSE,
col.names=c("accession", "length"),
sep="\t")depth_by_chrom_good_desjardins <- read.delim(
depth_by_chrom_good_desj_path,
header=TRUE, sep="\t")
depth_by_chrom_good_ashton <- read.delim(
depth_by_chrom_good_ashton_path,
header=TRUE, sep="\t")
depth_by_chrom_good <- rbind(depth_by_chrom_good_desjardins, depth_by_chrom_good_ashton)
depth_by_chrom <- depth_by_chrom_good %>%
select(sample, accession, norm_chrom_median, norm_chrom_mean)Import the file with all called CNVs and keep only the duplications.
cnv_calls <- read.delim(
cnv_calls_path,
header=TRUE, sep="\t")
dup_calls <- filter(cnv_calls, cnv == "duplication") %>%
left_join(chromosome_names, by="accession")
head(dup_calls)| accession | start | end | cnv | region_size | depth | norm_depth | smooth_depth | repeat_fraction | overlap_bp | feature_id | sample | lineage | chromosome |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| chr02_Bt22 | 6001 | 10000 | duplication | 4000 | 491.62 | 8.62 | 2.90 | 0.21 | 832 | SRS881178 | VNBI | chr02 | |
| chr05_Bt22 | 463501 | 475000 | duplication | 11500 | 282.04 | 4.95 | 4.95 | 0.12 | 1401 | CNAG_00127,CNAG_00128,CNAG_01369,CNAG_07313 | SRS881178 | VNBI | chr05 |
| chr07_Bt22 | 606501 | 611000 | duplication | 4500 | 150.03 | 2.63 | 2.47 | 0.80 | 3622 | CNAG_07666 | SRS881178 | VNBI | chr07 |
| chr12_Bt22 | 764001 | 775000 | duplication | 11000 | 95.71 | 1.68 | 1.67 | 0.00 | 0 | CNAG_06985,CNAG_06986,CNAG_07011,CNAG_07918 | SRS881178 | VNBI | chr12 |
| chr07_Bt22 | 1423501 | 1427500 | duplication | 4000 | 172.81 | 2.58 | 1.64 | 0.86 | 3429 | SRS885855 | VNBI | chr07 | |
| chr05_Bt22 | 464001 | 474500 | duplication | 10500 | 116.48 | 2.53 | 2.52 | 0.09 | 901 | CNAG_00127,CNAG_00128,CNAG_01369,CNAG_07313 | SRS885893 | VNBI | chr05 |
The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of the CNV region.
Get the percentage of each chromosome covered by repeats to know how much of a chromosome might not have reliable CNV calls.
lineages <- unique(metadata$lineage)
repeats_all <- data.frame()
for(lineage in lineages){
repeats_path <-paste(
repeats_path_prefix,
lineage, "/", lineage, "_repeats.bed",
sep ="")
repeats <- read.delim(repeats_path,
header=FALSE,
col.names=c("accession", "start", "end", "repeat_type"),
sep="\t")
repeats$lineage <- lineage
repeats_all <- rbind(repeats_all, repeats)
}repeats_percent <- repeats_all %>%
mutate(repeat_size_each = end - start)%>%
group_by(accession, lineage) %>%
summarise(repeat_size = sum(repeat_size_each)) %>%
left_join(chromosome_lengths, by="accession") %>%
left_join(chromosome_names, by=c("accession","lineage")) %>%
mutate(percent_repeat_size = round((repeat_size / length) * 100, 2))%>%
mutate(chromosome = as.factor(chromosome))%>%
select(lineage, accession, chromosome, percent_repeat_size)Most chromosomes have repeats in between 5 and 10% of the size. In VNBII it is closer to 15% for some chormosomes.
ggplot(dup_calls, aes(x = repeat_fraction, y = norm_depth)) +
geom_point(aes(color = lineage)) +
facet_wrap(~chromosome, ncol = 2)+
theme_minimal() +
theme(legend.position = "bottom")+
labs(title = "Repetitive Fraction vs. Depth",
y = "Normalized Depth",
x = "Fraction of CNV Overlaping with Repetitive Sequences")ggplot(dup_calls, aes(x = region_size, y = norm_depth)) +
geom_point(aes(color = lineage)) +
facet_wrap(~chromosome, ncol = 2)+
scale_x_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "bottom")+
labs(title = "Size of CNV vs. Depth",
y = "Normalized Depth",
x = "Size of CNV")ggplot(dup_calls, aes(x = start, y = norm_depth)) +
geom_point(aes(color = lineage)) +
facet_wrap(~chromosome, ncol = 2, scales = "free_x")+
scale_x_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "bottom")+
labs(title = "Depth Along Chromosomes",
y = "Normalized Depth",
x = "Position")There is a duplication in many VNI samples in Chr02 with depth from 10 to 70X with a similar size and position, and with more than half of the CNV overlapping with repetitive sequences.
Since that group in chr02 is the only with both depth higher than 10 and repeat fraction higher than 0.5, we use those thresholds to remove that group of CNVs.
dup_calls_filtered <- dup_calls %>%
filter(!(norm_depth > 10 & repeat_fraction > 0.5))ggplot(dup_calls_filtered, aes(x = repeat_fraction, y = norm_depth)) +
geom_point(aes(color = lineage)) +
facet_wrap(~chromosome, ncol = 2)+
theme_minimal() +
theme(legend.position = "bottom")+
labs(title = "Repetitive Fraction vs. Depth",
y = "Normalized Depth",
x = "Fraction of CNV Overlaping with Repetitive Sequences")ggplot(dup_calls_filtered, aes(x = region_size, y = norm_depth)) +
geom_point(aes(color = lineage)) +
facet_wrap(~chromosome, ncol = 2)+
scale_x_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "bottom")+
labs(title = "Size of CNV vs. Depth",
y = "Normalized Depth",
x = "Size of CNV")ggplot(dup_calls_filtered, aes(x = start, y = norm_depth)) +
geom_point(aes(color = lineage)) +
facet_wrap(~chromosome, ncol = 2, scales = "free")+
scale_x_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "bottom")+
labs(title = "Depth Along Chromosomes",
y = "Normalized Depth",
x = "Position")There is duplication in many VNI samples in Chr01 with a depth of up to 10X, it is not highly repetitive and it is in the same position.
Explore chr01 to know how to filter that out.
chr1_dup <- dup_calls_filtered %>%
filter(chromosome == "chr01")ggplot(chr1_dup, aes(x = start, y = sample, color = norm_depth)) +
geom_point() +
scale_x_continuous(labels = scales::comma, breaks = seq(0, max(chr1_dup$start), by = 100000)) +
theme_classic() +
theme(legend.position = "bottom",
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "CNVs along Chr01 in all samples colored by depth",
y = "Sample",
x = "Start Position")Make groups of CNVs that start at the same position in the same chromosome.
dup_summary <- dup_calls_filtered %>%
group_by(start, accession, lineage) %>%
summarize(median_depth = mean(norm_depth),
n = n(),
chromosome = first(chromosome),
size = mean(region_size))Print groups in chr01 with median higher than 4.
dup_summary %>%
filter(chromosome == "chr01", median_depth > 4)| start | accession | lineage | median_depth | n | chromosome | size |
|---|---|---|---|---|---|---|
| 1 | CP003820.1 | VNI | 7.885113 | 133 | chr01 | 2101.504 |
| 339001 | CP003820.1 | VNI | 10.320000 | 1 | chr01 | 12500.000 |
| 339501 | CP003820.1 | VNI | 5.839743 | 506 | chr01 | 10930.830 |
Print groups in chr01 that start between the positions 337000 and 340000.
dup_summary %>%
filter(chromosome == "chr01", start > 337000, start < 340000)| start | accession | lineage | median_depth | n | chromosome | size |
|---|---|---|---|---|---|---|
| 339001 | CP003820.1 | VNI | 10.320000 | 1 | chr01 | 12500.00 |
| 339501 | CP003820.1 | VNI | 5.839743 | 506 | chr01 | 10930.83 |
Filter out the CNVs in chr01 that start between the positions 337000 and 340000.
dup_calls_filtered <- dup_calls_filtered %>%
filter(!(chromosome == "chr01" & start > 337000 & start < 340000))ggplot(dup_calls_filtered, aes(x = repeat_fraction, y = norm_depth)) +
geom_point(aes(color = lineage)) +
facet_wrap(~chromosome, ncol = 2)+
theme_minimal() +
theme(legend.position = "bottom")+
labs(title = "Repetitive Fraction vs. Depth",
y = "Normalized Depth",
x = "Fraction of CNV Overlaping with Repetitive Sequences")ggplot(dup_calls_filtered, aes(x = region_size, y = norm_depth)) +
geom_point(aes(color = lineage)) +
facet_wrap(~chromosome, ncol = 2)+
scale_x_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "bottom")+
labs(title = "Size of CNV vs. Depth",
y = "Normalized Depth",
x = "Size of CNV")ggplot(dup_calls_filtered, aes(x = start, y = norm_depth)) +
geom_point(aes(color = lineage)) +
facet_wrap(~chromosome, ncol = 2, scales = "free")+
scale_x_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "bottom")+
labs(title = "Depth Along Chromosomes",
y = "Normalized Depth",
x = "Position")The dataframe all_metrics contains a row per individual called duplication with the information about the CNV and the chromosome of the sample. For the chromosome-sample without any called duplication, the columns related to CNVs are NAs.
all_metrics <- left_join(depth_by_chrom, dup_calls_filtered,
by = c("sample", "accession"))%>%
select(-chromosome, -lineage)%>%
left_join(chromosome_lengths, by = "accession") %>%
left_join(chromosome_names,by ="accession") %>%
left_join(metadata, by = c("sample", "lineage"))
head(all_metrics)| sample | accession | norm_chrom_median | norm_chrom_mean | start | end | cnv | region_size | depth | norm_depth | smooth_depth | repeat_fraction | overlap_bp | feature_id | length | lineage | chromosome | strain | source | dataset | vni_subdivision | samples_in_dataset_lineage | samples_in_lineage | total_samples |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SRS404518 | CP003820.1 | 0.98 | 0.97 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2291499 | VNI | chr01 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003821.1 | 0.98 | 1.12 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1621675 | VNI | chr02 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003822.1 | 0.99 | 1.01 | 1533001 | 1537000 | duplication | 4000 | 1246.32 | 11.64 | 5.32 | 0.08 | 312 | CNAG_06925,CNAG_06926 | 1575141 | VNI | chr03 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003823.1 | 1.00 | 0.99 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1084805 | VNI | chr04 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003824.1 | 0.99 | 0.98 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1814975 | VNI | chr05 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003825.1 | 1.00 | 0.99 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1422463 | VNI | chr06 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
Summarize the metrics of the called CNVs per chromosome. And keep the rest of the chromosome metrics.
chrom_metrics <- all_metrics %>%
group_by(dataset,lineage,
samples_in_lineage, samples_in_dataset_lineage,
total_samples,sample,
strain,source,
accession,chromosome, length, norm_chrom_mean, norm_chrom_median) %>%
summarise(total_cnv_size = sum(region_size),
n_cnvs = n(),
first = min(start),
last = max(end),
mean_cnv_depth = round(mean(norm_depth),2),
median_cnv_depth = round(median(norm_depth),2),
repeat_size = sum(overlap_bp)) %>%
ungroup()%>%
mutate(dup_coverage_percent = round((total_cnv_size / length) * 100, 2),
dup_span_size = last - first,
dup_span_percent = round((dup_span_size / length) * 100, 2),
dup_repeat_percent = round((repeat_size / length) * 100, 2),
chromosome = as.factor(chromosome),
dup_coverage_percent = ifelse(is.na(dup_coverage_percent), 0, dup_coverage_percent),
dup_span_percent = ifelse(is.na(dup_span_percent), 0, dup_span_percent))cat("The total number of chromosomes in all samples in the dataset is:", nrow(chrom_metrics))The total number of chromosomes in all samples in the dataset is: 14770
cat("The total number of chromosomes in all samples with called duplications is:", nrow(chrom_metrics %>% filter(!is.na(total_cnv_size))))The total number of chromosomes in all samples with called duplications is: 3865
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = n_cnvs)) +
geom_point(aes(color = dup_span_percent)) +
scale_x_continuous(labels = scales::comma) +
scale_color_distiller(palette = "BuPu", direction = 1, name = "Percent Spanned\nby CNVs") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Number of CNVs",
y = "Number of CNVs in Chromosome",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_point(aes(color = n_cnvs)) +
scale_x_continuous(labels = scales::comma) +
scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
y = "Percent of Chromosome Spanned by CNVs",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_point(aes(color = n_cnvs)) +
facet_wrap(~chromosome, ncol = 3)+
scale_x_continuous(labels = scales::comma) +
scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
y = "Percent of Chromosome Spanned by CNVs",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_point(aes(color = dup_repeat_percent)) +
scale_x_continuous(labels = scales::comma) +
scale_color_distiller(palette = "YlOrRd", direction = 1, trans = "log2", name = "Percent of CNV\nOverlaped by Repeats") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
y = "Percent of Chromosome Spanned by CNVs",
x = "Percent of Chromosome Covered by CNVs")Warning in scale_color_distiller(palette = "YlOrRd", direction = 1, trans =
"log2", : log-2 transformation introduced infinite values.
ggplot(chrom_metrics)+
geom_histogram(aes(dup_coverage_percent), binwidth = 1)+
scale_x_continuous(breaks = seq(0,100, by = 5)) +
theme_minimal() +
labs(title = "Histogram of Percent of Chromosome Covered by CNVs",
y = "Count",
x = "Percent of Chromosome Covered by CNVs")ggplot(filter(chrom_metrics, dup_coverage_percent > 0))+
geom_histogram(aes(dup_coverage_percent), binwidth = 1)+
scale_x_continuous(breaks = seq(0,100, by = 5)) +
theme_minimal() +
labs(title = "Histogram of Percent of Chromosome Covered by CNVs",
subtitle = "Only values above 0%",
y = "Count",
x = "Percent of Chromosome Covered by CNVs")ggplot(filter(chrom_metrics, dup_coverage_percent > 15))+
geom_boxplot(aes(x = chromosome, y = dup_coverage_percent))+
geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent))+
theme_minimal() +
labs(title = "Boxplot of Percent of Chromosome Covered by CNVs per Chromosome",
subtitle = "Only values above 15%",
y = "Count",
x = "Percent of Chromosome Covered by CNVs")ggplot(filter(chrom_metrics, dup_span_percent > 0))+
geom_histogram(aes(dup_span_percent), binwidth = 1)+
scale_x_continuous(breaks = seq(0,100, by = 5)) +
theme_minimal() +
labs(title = "Histogram of Percent of Chromosome Spanned by CNVs",
subtitle = "Only values above 0%",
y = "Count",
x = "Percent of Chromosome Spanned by CNVs")ggplot(chrom_metrics)+
geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
theme_minimal() +
labs(title = "Histogram of Normalized median depth of chromosomes",
y = "Count",
x = "Normalized median depth of chromosomes")ggplot(filter(chrom_metrics, norm_chrom_median > 1))+
geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
theme_minimal() +
labs(title = "Histogram of Normalized median depth of chromosomes",
subtitle = "Only values above 1",
y = "Count",
x = "Normalized median depth of chromosomes")ggplot(filter(chrom_metrics, norm_chrom_median > 1.2))+
geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
theme_minimal() +
labs(title = "Histogram of Normalized median depth of chromosomes",
subtitle = "Only values above 1.2",
y = "Count",
x = "Normalized median depth of chromosomes")ggplot(chrom_metrics)+
geom_histogram(aes(median_cnv_depth), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0, max(chrom_metrics$median_cnv_depth, na.rm = TRUE), by = 0.5)) +
theme_minimal() +
labs(title = "Histogram of Median Depth of CNVs",
y = "Count",
x = "Median Depth of CNVs")Warning: Removed 10905 rows containing non-finite outside the scale range
(`stat_bin()`).
The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of each CNV region.
The Median depth of CNV (median_cnv_depth) is the median of the Normalized depth of all CNVs in the chromosome. There are values bellow the threshold of depth to call a Duplication CNV because that threshold was applied to the smoothed values and this are the raw values.
The Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized).
The Percent of Chromosome Covered by CNVs (dup_coverage_percent) is the percentage of the full length of the chromosome that is part of called CNVs.
The Percent of Chromosome Spanned by CNVs (dup_span_percent) is the percentage of the full length of the chromosome that is in between the leftmost and rightmost CNVs.
color_range <- paste("2.6 -", max(chrom_metrics$median_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(median_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Median CNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
facet_wrap(~chromosome, ncol = 4)+
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(median_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Median CNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by CNVs")Filter by percent of chromosome in CNV or chromosome depth
size_threshold <- 50
depht_threshold <- 1.55chrom_metrics_filtered <- chrom_metrics %>%
filter(dup_coverage_percent >= size_threshold | norm_chrom_median >= 1.55)ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(median_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Median CNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
facet_wrap(~chromosome, ncol = 3)+
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(median_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Median CNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
geom_point(aes(color = n_cnvs)) +
facet_wrap(~chromosome, ncol = 3)+
scale_x_continuous(labels = scales::comma) +
scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
theme_minimal() +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by CNVs")+
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(color = "lightgray"),
panel.grid.minor.y = element_blank(),
panel.background = element_rect(fill = "white"),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.border = element_rect(colour = "lightgray", fill=NA, linewidth = 2),
legend.position = "right")ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
geom_point(aes(color = dup_span_percent)) +
facet_wrap(~chromosome, ncol = 3)+
scale_x_continuous(labels = scales::comma) +
scale_color_distiller(palette = "BuPu", direction = 1, name = "Percent Spanned\nby CNVs") +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by CNVs")+
theme_minimal()+
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(color = "lightgray"),
panel.grid.minor.y = element_blank(),
panel.background = element_rect(fill = "white"),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.border = element_rect(colour = "lightgray", fill=NA, linewidth = 2),
legend.position = "right")write_tsv(chrom_metrics_filtered, duplications_path)chrom_metrics_filtered %>%
filter(norm_chrom_median > 2.5)%>%
select(sample, strain, chromosome, norm_chrom_median, dup_coverage_percent)| sample | strain | chromosome | norm_chrom_median | dup_coverage_percent |
|---|---|---|---|---|
| ERS542397 | 14936_1#6 | chr04 | 2.74 | 64.94 |